from bs4 import BeautifulSoup
from urllib import request
import xlwt
import os
import pymysql

# Counters shared by the three scraping passes below; each indexes the
# dynamically named per-movie lists movie1..movie250:
#   j — pass 1 (name + detail URL), u — pass 2 (rating + quote),
#   p — pass 3 (poster URL + download).
# The original `global j/u/p` statements were no-ops: at module scope
# every name is already global, so plain assignments suffice.
j = 1
u = 1
p = 1

# Scrape 10 pages (25 movies each) of the Douban Top 250 list.
# Each movie accumulates into a dynamically named module-level list
# ``movieN`` (N = 1..250) created through locals(); at module scope
# locals() IS the module namespace, so the export blocks further down
# can read the same names.  Final list layout per movie:
#   [0] name, [1] detail URL, [2] rating, [3] quote, [4] poster URL.
for i in range(0,10):
    # Spoof a desktop-browser User-Agent; the default urllib UA is
    # presumably rejected by the site — confirm against live responses.
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'}
    a = request.Request(url='https://movie.douban.com/top250?start=' + str(i * 25),headers = headers)
    req = request.urlopen(a)
    html = req.read()
    html = html.decode("utf-8")

    soup = BeautifulSoup(html,"lxml")
    # Page structure assumption (verify against live markup):
    # "hd" divs hold title + link, "bd" divs hold rating + one-line
    # quote, "pic" divs hold the poster <img>.
    div_list1 = soup.find_all('div', attrs={"class": "hd"})
    div_list2 = soup.find_all('div', attrs={"class":"bd"})
    pic_list = soup.find_all('div', attrs={"class": "pic"})

    # Pass 1: create movieN and append name (index 0) and URL (index 1).
    for each in div_list1:
        movie_name = each.find('span',attrs={"class":"title"}).get_text()
        locals()['movie'+str(j)] = []
        locals()['movie' + str(j)].append(movie_name)
        link = each.a
        movie_url = link.get('href')
        locals()['movie' + str(j)].append(movie_url)
        j = j + 1

    # Pass 2: append rating (index 2) and quote (index 3).
    # [1:] presumably skips one page-level "bd" div that is not a movie
    # entry — confirm against the live page, or pass 2 misaligns with
    # pass 1 and the counters drift.
    for each in div_list2[1:]:
        movie_rank = each.find('span', attrs={"class":"rating_num"}).get_text()
        locals()['movie'+str(u)].append(movie_rank)
        if each.find('span', attrs={"class": "inq"}) is not None:
            movie_quote = each.find('span', attrs={"class": "inq"}).get_text()
            locals()['movie' + str(u)].append(movie_quote)
        else:
            # Placeholder quote for movies without a one-liner.
            locals()['movie' + str(u)].append('本电影无一句话简介')
        u = u + 1
    # Pass 3: append poster URL (index 4) and download the poster to
    # movie-pic\<name>.jpg unless it already exists.
    # NOTE(review): assumes the movie-pic directory exists and the movie
    # name contains no characters invalid in Windows filenames — confirm.
    for each in pic_list:
        pic_url = each.find('img')
        movie_pic_url = pic_url.get('src')
        locals()['movie' + str(p)].append(movie_pic_url)
        movie_pic_name = locals()['movie' + str(p)][0]
        if os.path.exists('movie-pic\\%s.jpg' % movie_pic_name) is False:
            # NOTE(review): urlretrieve sends the default urllib UA, not
            # the spoofed headers above — the image host may block it.
            request.urlretrieve(movie_pic_url, 'movie-pic\\%s.jpg' % movie_pic_name)
            print('%s海报下载完成' % movie_pic_name)
        else:
            print('%s海报已下载，无需重复下载' % movie_pic_name)
        p = p + 1
# Export the 250 scraped movies to douban-top250.xls, skipped when the
# workbook already exists from a previous run.  Reads the module-level
# lists movie1..movie250 built by the scraping loop above; each list is
# [name, url, rating, quote, poster_url].
# Fixes vs. original: no longer shadows the builtin `file`, replaces the
# `is False` anti-idiom with `not`, and collapses the six copy-pasted
# header writes / per-row lookups into loops.
if not os.path.exists('douban-top250.xls'):
    workbook = xlwt.Workbook()
    table = workbook.add_sheet('douban_top250')
    # Header row; column order: rank, name, rating, quote, url, poster.
    for col, caption in enumerate(("排名", "电影", "评分", "简介", "链接", "海报链接")):
        table.write(0, col, caption)
    for g in range(1, 251):
        movie = locals()['movie' + str(g)]  # single lookup per row
        table.write(g, 0, g)          # rank
        table.write(g, 1, movie[0])   # name
        table.write(g, 2, movie[2])   # rating
        table.write(g, 3, movie[3])   # quote
        table.write(g, 4, movie[1])   # detail-page url
        table.write(g, 5, movie[4])   # poster url

    workbook.save('douban-top250.xls')
    print('数据已经保存在douban-top250.xls中')

else:
    print('\n数据已爬取完成，无需重复爬取\n')
# Sync the scraped movies into MySQL table db_doubantop250, inserting
# only ranks not already present.  Reads the module-level lists
# movie1..movie250 ([name, url, rating, quote, poster_url]) built above.
# Fixes vs. original:
#   * parameterized queries instead of %-formatted SQL — the driver
#     escapes every value, so quotes in titles cannot break the SQL
#     (only one of six fields was escaped before, and
#     pymysql.escape_string was removed in PyMySQL 1.0);
#   * one SELECT per rank instead of two, no useless commit after reads;
#   * `rank` backtick-quoted — reserved word in MySQL 8+;
#   * connection reliably closed via try/finally.
db = pymysql.Connect(host='localhost', port=3307, user='root', password='root', database='douban', charset='utf8')
try:
    with db.cursor() as cursor:
        for t in range(1, 251):
            cursor.execute('select movie_name from db_doubantop250 where top = %s', (t,))
            row = cursor.fetchone()
            if row is not None:
                # %-formatting a 1-tuple unpacks it: prints the bare name.
                print('%s已存在,无需插入数据库' % row)
            else:
                movie = locals()['movie' + str(t)]
                cursor.execute(
                    'insert into db_doubantop250 (top,movie_name,`rank`,info,url,pic_url) '
                    'values (%s,%s,%s,%s,%s,%s)',
                    (t, movie[0], movie[2], movie[3], movie[1], movie[4]))
                db.commit()
finally:
    db.close()
print('\n数据库内没有的数据均插入成功')